This document demonstrates the use of the weightedRF and weightedLASSO functions for integrative GRN inference.

It shows how to select one optimal alpha value per gene, in order to apply data integration only to the target genes for which expression prediction error is reduced by using PWM prior information, significantly more than in a baseline model.

Data import

Import of the expression data and the N-responsive genes and regulators :

load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426   45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426  201
pwm_imputed <- pwm_occurrence
pwm_imputed[is.na(pwm_imputed)] <- 0.5

ALPHAS <- seq(0,1, by = 0.1)

Prediction error as a function of data integration strengh (alpha)

The following steps are time and CPU intensive, so the result files can just be loaded to be analysed in further steps.

For weightedRF

Generating 100 repetitions of weightedRF MSE estimation on true data, and on shuffled data.

lmses <- data.frame(row.names = genes)
N <-100
for(alpha in ALPHAS){
  for(perm in 1:N){
    lmses[,paste(as.character(alpha), perm, "true_data")] <- weightedRF_inference_MSE(counts, genes, tfs, alpha = alpha, nTrees = 2000,
                             pwm_occurrence = pwm_occurrence, nCores = 45, tf_expression_permutation = F)
  }

  for(perm in 1:N){
    lmses[,paste(as.character(alpha), perm, "shuffled")] <- weightedRF_inference_MSE(counts, genes, tfs, alpha = alpha, nTrees = 2000,
                           pwm_occurrence = pwm_occurrence, nCores = 45, tf_expression_permutation = T)
  }
}

save(lmses, file = "results/brf_perumtations_mse_all_genes_predict.rdata")
subset<-unique(rownames(lmses))

For weightedLASSO

Generating 100 repetitions of weightedLASSO MSE estimation on true data, and on shuffled data.

# lmses <- data.frame(row.names = genes)
# N<-50
# for(alpha in ALPHAS){
#   for(perm in 1:N){
#     lmses[,paste(as.character(alpha), perm, "true_data")] <- weightedLASSO_inference_MSE(counts, genes, tfs, alpha = alpha, N=50,
#                              pwm_occurrence = pwm_occurrence, nCores = 40, tf_expression_permutation = F)
# 
#     lmses[,paste(as.character(alpha), perm, "shuffled")] <- weightedLASSO_inference_MSE(counts, genes, tfs, alpha = alpha, N=50,
#                            pwm_occurrence = pwm_occurrence, nCores = 40, tf_expression_permutation = T)
#   }
# }
# save(lmses, file = "results/lasso_perumtations_mse_all_genes_test.rdata")

Getting the importance metrics of inferred edges in order to measure effective data integration and build inferred GRNs

For weightedRF

nCores = 45
mats <- list()
nrep <- 100
for(alpha in ALPHAS){ # exploring PWM integration strength
  for(rep in 1:nrep){ # exploring inherent variability
    
    mat_rf <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
                            alpha = alpha,
                            pwm_occurrence = pwm_occurrence,
                            nCores = nCores,
                            importance = "%IncMSE")
    
    mat_rf_perm <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
                            alpha = alpha, tf_expression_permutation = TRUE,
                            pwm_occurrence = pwm_occurrence,
                            nCores = nCores,
                            importance = "%IncMSE")
    
    mats[[paste0("bRF_", as.character(alpha),  '_trueData_', rep)]] <- mat_rf
    mats[[paste0("bRF_", as.character(alpha),  '_shuffled_', rep)]] <- mat_rf_perm
    
  }
}
save(mats, file = "results/100_permutations_bRF_importances_inf.rdata")

# thresholds the regulatory weights at a certain density to build GRNs
edges <- list()
densities <- c(0.005, 0.01,0.05)
for(name in names(mats)){ 
      for(density in densities){# exploring importance threshold stringency
        edges[[paste0(name, '_', density)]] <-
        weightedRF_network(mats[[name]], density = density, pwm_occurrence, genes, tfs)
      }
}
save(edges, file = "results/100_permutations_bRF_edges_inf.rdata")

# validation of GRN edges against DAP-Seq
settings <- c("model", "alpha", "dataset", "rep", "density")
val_dap <-
  evaluate_networks(
    edges,
    validation = c("DAPSeq"),
    nCores = 35,
    input_genes = genes,
    input_tfs = tfs
  )
val_dap[, settings] <-
  str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_rf_validation_dap_inf.rdata")

For weightedLASSO

nCores = 40
# mats <- list()
nrep <- 50
for(alpha in c(1)){ # exploring PWM integration strength
  for(rep in 1:nrep){ # exploring inherent variability
    
    mat_lasso <- weightedLASSO_inference(counts = counts, genes = genes, tfs = tfs,
                                     alpha = alpha, N = 50, 
                                     tf_expression_permutation = FALSE,
                                     int_pwm_noise = 0, mda_type = "shuffle",
                                     pwm_occurrence = pwm_occurrence,
                                     nCores = nCores)
    
    mat_lasso_perm <- weightedLASSO_inference(counts, genes, tfs,
                                     alpha = alpha, N = 50, 
                                     tf_expression_permutation = TRUE,
                                     int_pwm_noise = 0, mda_type = "shuffle",
                                     pwm_occurrence = pwm_occurrence,
                                     nCores = nCores)
    
    mats[[paste0("LASSO_", as.character(alpha),  '_trueData_', rep)]] <- mat_lasso
    mats[[paste0("LASSO_", as.character(alpha),  '_shuffled_', rep)]] <- mat_lasso_perm
    
  }
}

# grn <- weightedLASSO_network(mat_lasso, density = 0.005, 
#                              pwm_occurrence = pwm_occurrence, 
#                              genes = genes, tfs = tfs, decreasing = T)
# median(mat_lasso["mse",])
# 
# mean(grn$pwm)
# evaluate_network(grn, genes, tfs, validation = "DAPSeq")[c("tpr", "recall")]
# 
# median(mats$LASSO_1_trueData_1["mse",])

save(mats, file = "results/100_permutations_lasso_mda_shuffle.rdata")

# thresholds the regulatory weights at certain densities to build GRNs
edges <- list()
lmses <- data.frame(row.names = genes)
densities <- c(0.005, 0.01,0.05)
for(name in names(mats)){ 
      for(density in densities){# exploring importance threshold stringency
        edges[[paste0(name, '_', density)]] <-
        weightedLASSO_network(mats[[name]], density = density, pwm_occurrence, 
                          genes, tfs, decreasing = TRUE)
        
        lmses[,name] <- mats[[name]]["mse",]
      }
  
}
save(edges, file = "results/100_permutations_lasso_edges.rdata")
save(lmses, file = "results/lasso_perumtations_mse_all_genes_test.rdata")


# validation of GRN edges against DAP-Seq
settings <- c("model", "alpha", "dataset", "rep", "density")
val_dap <-
  evaluate_networks(
    edges,
    validation = c("DAPSeq"),
    nCores = 35,
    input_genes = genes,
    input_tfs = tfs
  )
val_dap[, settings] <-
  str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_lasso_validation_dap.rdata")

Showing the MSE and effective data integration behaviors

# prior values
pwm_imputed <- pwm_occurrence
pwm_imputed[is.na(pwm_imputed)] <- 0.5

# parallel sapply to parallelise the computing of optimal alphas
mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) {
  FUN <- match.fun(FUN)
  answer <- parallel::mclapply(X = X, FUN = FUN, ...)
  if (USE.NAMES && is.character(X) && is.null(names(answer))) 
    names(answer) <- X
  if (!isFALSE(simplify) && length(answer)) 
    simplify2array(answer, higher = (simplify == "array"))
  else answer
}

weightedRF

# importance metrics and MSE for weightedRF
load("results/brf_perumtations_mse_all_genes_predict.rdata")
load("results/100_permutations_bRF_importances_inf.rdata")


# needs an appropriate mats variable and a lmses variable to be loaded

# plots the importance of TFs that have a specific prior value, here one,
# as alpha is increased
draw_gene_effective_integration <- function(gene, prior=1, type = "rank", without_1 = F){
  tfs_with_motif <- names(which(pwm_imputed[gene,]== prior))
  if(type == "rank")
    data <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])}))
  if(type == "imp")
    data <- data.frame(lapply(mats, function(mat){mean(mat[,gene][tfs_with_motif])}))
  data <- data %>%
  gather(key = "setting", value = "summed_importance") %>%
  separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_")
  
  if(without_1)
    data <- data %>%
    filter(alpha != "1")
  
  plot <- data %>%
    group_by(alpha, dataset) %>%
  mutate(mean_imp = mean(summed_importance),
            sd_imp = sd(summed_importance),
         alpha = as.numeric(alpha))%>%
  ggplot(aes(x=alpha, y = summed_importance, color = dataset, fill = dataset)) +
  geom_point(alpha = 0.1) + geom_line(aes(y=mean_imp))+
  geom_ribbon(aes(ymin = mean_imp - sd_imp , 
                    ymax = mean_imp + sd_imp  ), 
                alpha = .4)  +theme_pubr(legend = "none")+
    ylab("Rank of TFs with Pi=1")+
    ggtitle("Effective data integration") + 
    labs(subtitle = gene)+
    scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))
  plot + xlab(expression(alpha))
}


# plots the MSE as alpha is increased
draw_gene_mse <- function(gene, title = NULL, without_1 = F){
  data <-
    lmses[gene, ] %>%
    gather() %>%
    separate(key,
             into = c("model", "alpha", "dataset",  "rep"),
             sep = "_") 
  if(without_1)
    data <- data %>%
    filter(alpha != "1")
  
  data %>%
    group_by(alpha, dataset) %>%
    mutate(mean_mse = mean(value, na.rm = T),
           sd_mse = sd(value, na.rm = T)) %>%
    ggplot(aes(
      x = as.numeric(alpha),
      y = value,
      color = dataset,
      fill = dataset
    )) +ggtitle(paste("MSE"))+ylab("MSE/Var(gene)")+
    geom_ribbon(aes(ymin = mean_mse - sd_mse , 
                    ymax = mean_mse + sd_mse  ), 
                alpha = .4)  +theme_pubr(legend = "top")+
    geom_point(alpha = 0.1) + geom_line(aes(y=mean_mse))+xlab(expression(alpha))+ 
    scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))
}


# plots the MSE as a function of effective data integration
get_opt_alpha_per_gene <- function(gene, type = "rank", 
                                   return_cluster = F, metric = 'div'){
  tfs_with_motif <- names(which(pwm_occurrence[gene,]==1))
  # gene with no TFBS has an optimal alpha of 0
  if(return_cluster & length(tfs_with_motif)==0) return(0)
  # else
  if(length(tfs_with_motif)>0){
    
  if(type == "rank")
    data <- data.frame(lapply(mats, function(mat){mean(rank(mat[,gene])[tfs_with_motif])}))
  if(type == "imp")
    data <- data.frame(lapply(mats, function(mat){mean(mat[,gene][tfs_with_motif])}))
  
  inte <- data %>%
  gather(key = "setting", value = "summed_importance") %>%
  separate(setting, into = c("method", "alpha", "dataset", "rep"), sep = "_") %>%
  group_by(alpha, dataset) %>%
  mutate(mean_imp = mean(summed_importance, na.rm=T),
            sd_imp = sd(summed_importance, na.rm=T),
         alpha = as.numeric(alpha))
  
  
  curves <- lmses[gene, ] %>%
    gather() %>%
   separate(key,
             into = c("model", "alpha", "dataset",  "rep"),
             sep = "_") %>%
    group_by(alpha, dataset) %>%
    mutate(mean_mse = mean(value, na.rm = T),
           sd_mse = sd(value, na.rm = T)) %>%
    mutate(alpha = as.numeric(alpha))%>%
    full_join(inte, by = c("alpha", "dataset", "rep")) %>%
    group_by(alpha, mean_imp, dataset) %>%
    summarise(mean_mse = mean(value, na.rm = T),
           sd_mse = sd(value, na.rm = T)) 
  
  curves <- curves %>%
  group_by(dataset) %>%
    arrange(dataset, mean_imp)%>%
      mutate(imps=curves[curves$dataset=="trueData", ]$mean_imp) %>%
      mutate(approx_mse = approx(mean_imp,mean_mse,curves[curves$dataset=="trueData", ]$mean_imp, rule=2)$y,
             approx_sd = approx(mean_imp,sd_mse,curves[curves$dataset=="trueData", ]$mean_imp, rule=2)$y) 
  
  
  true <- curves[curves$dataset=="trueData",]
  shuff <- curves[curves$dataset!="trueData",]
  
  # true$div <- (shuff$approx_mse - shuff$approx_sd) - (true$mean_mse)
  true$div <- ifelse((shuff$approx_mse - true$mean_mse)/shuff$approx_sd>1, 
                     (shuff$approx_mse - true$mean_mse)/shuff$approx_sd, 
                     0)
  
  if(metric == "div"){
    if(max(true$div)>0) alpha_opt <- true[true$div == max(true$div),]$alpha
    else alpha_opt <- 0
  }
  
  if(metric == "min"){
    alpha_opt <- true[true$mean_mse == min(true$mean_mse),]$alpha
  }
    
  eff_opt = true[true$alpha==alpha_opt,]$mean_imp
  
  padding = ifelse(type=="rank", ifelse(alpha_opt<0.2, 17, -17), 0)
  if(return_cluster) return(alpha_opt)
  
  curves%>%
    ggplot(aes(y=mean_mse, x = mean_imp, color = dataset, fill = dataset))+
  # geom_ribbon(aes(ymin =mean_mse-sd_mse , 
  #                 ymax = mean_mse + sd_mse),  alpha = .4)+ 
    geom_ribbon(aes(x=imps,ymin =approx_mse-approx_sd ,
                  ymax = approx_mse + approx_sd), alpha = .25)+
    geom_point(alpha = 0.1, size = 0.5) +
    geom_line(aes(x=mean_imp, y = mean_mse), size=1) + 
      # geom_line(aes(x=imps, y = approx_mse), col="black")+
  theme_pubr(legend = "none") +
    ylab("MSE/Var(gene)") + xlab("Effective data integration") + ggtitle("MSE=f(effective integration)")+ 
    scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    geom_vline(xintercept = eff_opt, size = 2, col="#4670CD") +
  annotate("text", x=eff_opt+padding, y=max(shuff$mean_mse), 
           label=paste("Optimal\nintegration:\nalpha =", alpha_opt), 
           angle=0, col = "#4670CD", size =3.5 )
  }
  else ggplot()
}


gene_no_benefit <- "AT1G30270"
gene_benefit_optimum <- "AT1G14720"
gene_benefit <- "AT5G48970"

n <- draw_gene_effective_integration(gene_no_benefit, prior = 1) +
  draw_gene_mse(gene_no_benefit)+
  get_opt_alpha_per_gene(gene_no_benefit)+
  plot_annotation(title = gene_no_benefit) 
o <- draw_gene_effective_integration(gene_benefit_optimum, prior = 1) +
  draw_gene_mse(gene_benefit_optimum)+theme(legend.position = "none")+
  get_opt_alpha_per_gene(gene_benefit_optimum)+
  plot_annotation(title = gene_benefit_optimum) 
p <- draw_gene_effective_integration(gene_benefit, prior = 1) +
  draw_gene_mse(gene_benefit)+theme(legend.position = "none")+
  get_opt_alpha_per_gene(gene_benefit)+
  plot_annotation(title = gene_benefit)


figure <- n/o/p;figure

ggexport(figure, filename = "results/gene_examples_weightedRF.pdf", width = 10, height = 9)
# value of alpha per genes:
alphas_rf <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank", 
                          return_cluster=T, mc.cores=34)
hist(alphas_rf)
save(alphas_rf, file = "results/alpha_per_gene_weighted_RF.rdata")

alphas_rf <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank", metric = "min",
                          return_cluster=T, mc.cores=34)
hist(alphas_rf)
save(alphas_rf, file = "results/alpha_per_gene_weighted_RF_min.rdata")
#turns old mse of RF into new format returned by inference
colnames(lmses) <- 
  paste0("RF", "_", str_split_fixed(colnames(lmses), ' ', 3)[,1], '_', 
       str_split_fixed(colnames(lmses), ' ', 3)[,3], "_", 
       str_split_fixed(colnames(lmses), ' ', 3)[,2]) %>%
  str_replace("_data", "Data")
save(lmses, file = "results/brf_perumtations_mse_all_genes_predict.rdata")

Doing the same for weightedLASSO:

# for the lasso
load("results/lasso_perumtations_mse_all_genes_test.rdata")
load("results/100_permutations_lasso_mda_shuffle.rdata")
# load("results/100_permutations_lasso_edges.rdata")

# gene_no_benefit <- "AT5G66850"
# # the same as RFs
# gene_benefit_optimum <- "AT1G14720"
# gene_benefit <- "AT2G38180"


n <- draw_gene_effective_integration(gene_no_benefit, prior = 1) +
  draw_gene_mse(gene_no_benefit)+
  get_opt_alpha_per_gene(gene_no_benefit)+
  plot_annotation(title = gene_no_benefit) 
o <- draw_gene_effective_integration(gene_benefit_optimum, prior = 1) +
  draw_gene_mse(gene_benefit_optimum)+theme(legend.position = "none")+
  get_opt_alpha_per_gene(gene_benefit_optimum)+
  plot_annotation(title = gene_benefit_optimum) 
p <- draw_gene_effective_integration(gene_benefit, prior = 1) +
  draw_gene_mse(gene_benefit)+theme(legend.position = "none")+
  get_opt_alpha_per_gene(gene_benefit)+
  plot_annotation(title = gene_benefit)


n/o/p

figure <- n/o/p
ggexport(figure, filename = "results/gene_examples_weightedLASSO.pdf", width = 10, height = 9)


# nrt <- sample(int, size = 1)
# draw_gene_effective_integration(nrt, prior = 1, type = "imp")
# draw_gene_effective_integration(nrt, prior = 0.5, type = "imp")
# draw_gene_effective_integration(nrt, prior = 0, type = "imp")
# get_opt_alpha_per_gene(nrt, type = "imp")
# value of alpha per genes:
alphas_lasso <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank", 
                          return_cluster=T, mc.cores=34)
hist(alphas_lasso)
save(alphas_lasso, file = "results/alpha_per_gene_weighted_LASSO_test.rdata")

alphas_lasso <- mcsapply(genes, get_opt_alpha_per_gene, type = "rank", 
                          return_cluster=T, metric = "min", mc.cores=34)
hist(alphas_lasso)
save(alphas_lasso, file = "results/alpha_per_gene_weighted_LASSO_min.rdata")

For divergence from permutation criteria :

load("results/alpha_per_gene_weighted_LASSO_test.rdata")
load("results/alpha_per_gene_weighted_RF.rdata")

data.frame(weightedLASSO = alphas_lasso, weightedRF = alphas_rf) %>%
  gather(key = "model", value = "alpha") %>%
ggplot(aes(x = alpha, fill = model)) +
  geom_histogram()+
  facet_wrap(~model)+
  theme(axis.title.y = element_blank(), legend.position = "none")+
  scale_fill_manual( values = c("#C55A11","#E67F87"))+theme_pubr()

p<-data.frame(weightedLASSO = alphas_lasso, weightedRF = alphas_rf) %>%
  rownames_to_column("gene") %>%
  mutate(gene_type=gene %in% tfs) %>%
  ggplot(aes(x=weightedLASSO, y=weightedRF)) +
  geom_point(alpha = 0)+
  geom_bin2d(bins=11)+ 
  scale_fill_gradient(name = "count", trans = "log",
                      low = "white", high = "#4670CD",
                      breaks = c(2,10,50,250,700), 
                      labels = c(2,10,50,250,700))+
  theme_pubclean() + theme(legend.position = "right", 
                           axis.text = element_text(size = 15),
                           axis.title = element_text(size = 20))

ggExtra::ggMarginal(p, type = "histogram", fill = "#C55A11", size = 1)

For minimum MSE criteria :

load("results/alpha_per_gene_weighted_LASSO_min.rdata")
load("results/alpha_per_gene_weighted_RF_min.rdata")

data.frame(weightedLASSO = alphas_lasso, weightedRF = alphas_rf) %>%
  gather(key = "model", value = "alpha") %>%
ggplot(aes(x = alpha, fill = model)) +
  geom_histogram()+
  facet_wrap(~model)+
  theme(axis.title.y = element_blank(), legend.position = "none")+
  scale_fill_manual( values = c("#C55A11","#E67F87"))+theme_pubr()

p<-data.frame(weightedLASSO = alphas_lasso, weightedRF = alphas_rf) %>%
  rownames_to_column("gene") %>%
  mutate(gene_type=gene %in% tfs) %>%
  ggplot(aes(x=weightedLASSO, y=weightedRF)) +
  geom_point(alpha = 0)+
  geom_bin2d(bins=11)+ 
  scale_fill_gradient(name = "count", trans = "log",
                      low = "white", high = "#4670CD",
                      breaks = c(2,10,50,250,700), 
                      labels = c(2,10,50,250,700))+
  theme_pubclean() + theme(legend.position = "right", 
                           axis.text = element_text(size = 15),
                           axis.title = element_text(size = 20))

ggExtra::ggMarginal(p, type = "histogram", fill = "#C55A11", size = 1)

Differences between gene lists for the two criteria

For the lasso :

load("results/alpha_per_gene_weighted_LASSO_min.rdata")
load("results/alpha_per_gene_weighted_RF_min.rdata")

alphas_lasso_min <- alphas_lasso
alphas_rf_min <- alphas_rf

load("results/alpha_per_gene_weighted_LASSO_test.rdata")
load("results/alpha_per_gene_weighted_RF.rdata")

alphas_lasso_div <- alphas_lasso
alphas_rf_div <- alphas_rf

saved_lasso <- setdiff(names(alphas_lasso_div[alphas_lasso_div > 0]), 
                             names(alphas_lasso_min[alphas_lasso_min > 0]))

saved_rf <- setdiff(names(alphas_rf_div[alphas_rf_div > 0]), 
                             names(alphas_rf_min[alphas_rf_min > 0]))

same_rf <- intersect(names(alphas_rf_div[alphas_rf_div > 0]), 
                             names(alphas_rf_min[alphas_rf_min > 0]))

lost_lasso <- setdiff(names(alphas_lasso_min[alphas_lasso_min > 0]),
                            names(alphas_lasso_div[alphas_lasso_div > 0]))

lost_rf <- setdiff(names(alphas_rf_min[alphas_rf_min > 0]),
                         names(alphas_rf_div[alphas_rf_div > 0]))

same_lasso <- intersect(names(alphas_lasso_min[alphas_lasso_min > 0]),
                        names(alphas_lasso_div[alphas_lasso_div > 0]))

matrix(c(length(same_lasso), length(lost_lasso), length(saved_lasso), 
         length(genes) - length(same_lasso) - length(lost_lasso) - length(saved_lasso)), 
       nrow = 2, byrow = F, dimnames = list(c("data integration div", "no data integration div"),
                                            c("data integration min", "no data integration min")))
##                         data integration min no data integration min
## data integration div                     445                      24
## no data integration div                  585                     372

For the RFs :

matrix(c(length(same_rf), length(lost_rf), length(saved_rf), 
         length(genes) - length(same_rf) - length(lost_rf) - length(saved_rf)), 
       nrow = 2, byrow = F, dimnames = list(c("data integration div", "no data integration div"),
                                            c("data integration min", "no data integration min")))
##                         data integration min no data integration min
## data integration div                     351                       3
## no data integration div                  388                     684

examples at alpha optimal at 0.9

lasso_1<- names(alphas_lasso[alphas_lasso==0.9])
subset <- sample(lasso_1, size = 8, replace = F)

wo1 <- F 
for(nrt in subset ){
  print(draw_gene_effective_integration(nrt, prior = 1) +
  draw_gene_mse(nrt)+theme(legend.position = "none")+
  get_opt_alpha_per_gene(nrt)+
  plot_annotation(title = nrt))
}

examples of alpha optimal at 1 for lasso and rf

both1<-c("AT3G16560", "AT5G48970" ,"AT3G16350", "AT1G22500" ,"AT3G06110" ,"AT2G46790" ,"AT5G52860")
for(nrt in both1){
  print(draw_gene_effective_integration(nrt, prior = 1) +
  draw_gene_mse(nrt)+theme(legend.position = "none")+
  get_opt_alpha_per_gene(nrt)+
  plot_annotation(title = nrt))
}

load("results/brf_perumtations_mse_all_genes_predict.rdata")
load("results/100_permutations_bRF_importances_inf.rdata")
both1<-c("AT3G16560", "AT5G48970" ,"AT3G16350", "AT1G22500" ,"AT3G06110" ,"AT2G46790" ,"AT5G52860")
for(nrt in both1){
  print(draw_gene_effective_integration(nrt, prior = 1) +
  draw_gene_mse(nrt)+theme(legend.position = "none")+
  get_opt_alpha_per_gene(nrt)+
  plot_annotation(title = nrt))
}

load("results/lasso_perumtations_mse_all_genes_test.rdata")
load("results/100_permutations_lasso_mda_shuffle.rdata")

Examples of rank and mean importance in effective integration

In alphas opt at 0.9

for(nrt in subset){
  print(draw_gene_effective_integration(nrt, prior = 1) + ggtitle("Pi=1, rank")+
          draw_gene_effective_integration(nrt, prior = 1, type="imp") +ggtitle("Pi=1")+
                  draw_gene_effective_integration(nrt, prior = 0.5) +ggtitle("Pi=1/2, rank")+
          draw_gene_effective_integration(nrt, prior = 0.5, type="imp") +ggtitle("Pi=1/2")+
                  draw_gene_effective_integration(nrt, prior = 0) +ggtitle("Pi=0, rank")+
          draw_gene_effective_integration(nrt, prior = 0, type="imp") +ggtitle("Pi=0")+
  get_opt_alpha_per_gene(nrt)+
    get_opt_alpha_per_gene(nrt, type = "imp")+
  plot_annotation(title = nrt)+plot_layout(ncol=4))
}

Comparison with the forests (is it the same order of mangitude for variable importance?)

load("results/brf_perumtations_mse_all_genes_predict.rdata")
load("results/100_permutations_bRF_importances_inf.rdata")
wo1 <- F
for(nrt in subset){
  print(draw_gene_effective_integration(nrt, prior = 1) + ggtitle("Pi=1, rank")+
          draw_gene_effective_integration(nrt, prior = 1, type="imp") +ggtitle("Pi=1")+
                  draw_gene_effective_integration(nrt, prior = 0.5) +ggtitle("Pi=1/2, rank")+
          draw_gene_effective_integration(nrt, prior = 0.5, type="imp") +ggtitle("Pi=1/2")+
                  draw_gene_effective_integration(nrt, prior = 0) +ggtitle("Pi=0, rank")+
          draw_gene_effective_integration(nrt, prior = 0, type="imp") +ggtitle("Pi=0")+
  get_opt_alpha_per_gene(nrt)+
    get_opt_alpha_per_gene(nrt, type = "imp")+
  plot_annotation(title = nrt)+plot_layout(ncol=4))
}
load("results/lasso_perumtations_mse_all_genes_test.rdata")
load("results/100_permutations_lasso_mda_shuffle.rdata")

Plotting MSE behaviour for all genes

Depending on their class (optimal alpha different from 0 or not).

weightedRF

load("results/brf_perumtations_mse_all_genes_predict.rdata")
load(file = "results/alpha_per_gene_weighted_RF.rdata")

pos_class <- names(alphas_rf[alphas_rf!=0])

mse <- lmses[str_detect(colnames(lmses), "trueData")]
for(alpha in seq(0,1, by = 0.1)){
  mse[,paste("alpha",alpha)] <- rowMeans(mse[,str_detect(colnames(mse), paste0("RF_",as.character(alpha), "_"))])
}
mse <- as.matrix(mse[str_detect(colnames(mse), "alpha")])
mse_interest <- mse[pos_class,]
mse_other <- mse[setdiff(rownames(mse), pos_class),]

library(circlize)
col_fun = colorRamp2(c(-2, 0, 2), hcl_palette = "Blue-Red 3")

ha = HeatmapAnnotation(
    alpha = anno_simple(as.numeric(str_remove(colnames(mse), "alpha "))),
    annotation_name_side = "left")

#pdf("results/rf_mse_interest.pdf", height = 5)
Heatmap((mse_interest-rowMeans(mse_interest))/matrixStats::rowSds(mse_interest), 
        col = col_fun, show_column_names = F,
         width = ncol(mse_interest)*unit(10, "mm"), 
      height = nrow(mse_interest)*unit(0.2, "mm"),
        cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
  rowAnnotation(class = ifelse(alphas_rf[rownames(mse_interest)]>0, 
                               "class of interest", "no integration"), 
                col = list(class = setNames(c("#70AD47", "grey"), 
                                            nm = c("class of interest", "no integration"))))

#dev.off()
#pdf("results/rf_mse_others.pdf", height = 10)
Heatmap((mse_other-rowMeans(mse_other))/matrixStats::rowSds(mse_other),
        col = col_fun,show_column_names = F,
        width = ncol(mse_other)*unit(10, "mm"), 
      height = nrow(mse_other)*unit(0.2, "mm"),
        cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
  rowAnnotation(class = ifelse(alphas_rf[rownames(mse_other)]>0, 
                               "class of interest", "no integration"), 
                col = list(class = setNames(c("#70AD47", "grey"), 
                                            nm = c("class of interest", "no integration"))))

#dev.off()

For the lasso

load("results/lasso_perumtations_mse_all_genes_test.rdata")
load("results/alpha_per_gene_weighted_LASSO_test.rdata")

pos_class <- names(alphas_lasso[alphas_lasso!=0])

mse <- lmses[str_detect(colnames(lmses), "trueData")]
for(alpha in seq(0,1, by = 0.1)){
  mse[,paste("alpha",alpha)] <- rowMeans(mse[,str_detect(colnames(mse), paste0("LASSO_",as.character(alpha), "_"))])
}
mse <- as.matrix(mse[str_detect(colnames(mse), "alpha")])
mse_interest <- mse[pos_class,]
mse_other <- mse[setdiff(rownames(mse), pos_class),]

library(circlize)
col_fun = colorRamp2(c(-2, 0, 2), hcl_palette = "Blue-Red 3")

ha = HeatmapAnnotation(
    alpha = anno_simple(as.numeric(str_remove(colnames(mse), "alpha "))),
    annotation_name_side = "left")

#pdf("results/lasso_mse_interest.pdf", height = 7)
Heatmap((mse_interest-rowMeans(mse_interest))/matrixStats::rowSds(mse_interest), 
        col = col_fun, show_column_names = F,
         width = ncol(mse_interest)*unit(10, "mm"), 
      height = nrow(mse_interest)*unit(0.2, "mm"),
        cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
  rowAnnotation(class = ifelse(alphas_lasso[rownames(mse_interest)]>0, 
                               "class of interest", "no integration"), 
                col = list(class = setNames(c("#70AD47", "grey"), 
                                            nm = c("class of interest", "no integration"))))

#dev.off()
#pdf("results/lasso_mse_others.pdf", height = 7)

Heatmap((mse_other-rowMeans(mse_other))/matrixStats::rowSds(mse_other),
        col = col_fun,show_column_names = F,
        width = ncol(mse_other)*unit(10, "mm"), 
      height = nrow(mse_other)*unit(0.2, "mm"),
        cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
  rowAnnotation(class = ifelse(alphas_lasso[rownames(mse_other)]>0, 
                               "class of interest", "no integration"), 
                col = list(class = setNames(c("#70AD47", "grey"), 
                                            nm = c("class of interest", "no integration"))))

#dev.off()



# 
# 
# Heatmap((mse_interest-rowMeans(mse_interest))/matrixStats::rowSds(mse_interest), 
#         col = col_fun, show_column_names = F,
#          width = ncol(mse_interest)*unit(10, "mm"), 
#       height = nrow(mse_interest)*unit(0.2, "mm"),
#         cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
#   rowAnnotation(class = ifelse(alphas_lasso[rownames(mse_interest)]>0, 
#                                "class of interest", "no integration"), 
#                 col = list(class = setNames(c("#70AD47", "grey"), 
#                                             nm = c("class of interest", "no integration"))))+
#   Heatmap(mse_interest, 
#         col = col_fun, show_column_names = F,
#          width = ncol(mse_interest)*unit(10, "mm"), 
#       height = nrow(mse_interest)*unit(0.2, "mm"),
#         cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
#   rowAnnotation(class = ifelse(alphas_lasso[rownames(mse_interest)]>0, 
#                                "class of interest", "no integration"), 
#                 col = list(class = setNames(c("#70AD47", "grey"), 
#                                             nm = c("class of interest", "no integration"))))
# 
# Heatmap((mse_other-rowMeans(mse_other))/matrixStats::rowSds(mse_other),
#         col = col_fun,show_column_names = F,
#         width = ncol(mse_other)*unit(10, "mm"), 
#       height = nrow(mse_other)*unit(0.2, "mm"),
#         cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
#   rowAnnotation(class = ifelse(alphas_lasso[rownames(mse_other)]>0, 
#                                "class of interest", "no integration"), 
#                 col = list(class = setNames(c("#70AD47", "grey"), 
#                                             nm = c("class of interest", "no integration"))))+
#   Heatmap(mse_other,
#         col = col_fun,show_column_names = F,
#         width = ncol(mse_other)*unit(10, "mm"), 
#       height = nrow(mse_other)*unit(0.2, "mm"),
#         cluster_columns = F, show_row_names = F, top_annotation = ha)+ 
#   rowAnnotation(class = ifelse(alphas_lasso[rownames(mse_other)]>0, 
#                                "class of interest", "no integration"), 
#                 col = list(class = setNames(c("#70AD47", "grey"), 
#                                             nm = c("class of interest", "no integration"))))

Intersection between classes

Divergence from perm :

load("results/alpha_per_gene_weighted_LASSO_test.rdata")
load("results/alpha_per_gene_weighted_RF.rdata")

venn <- ggVennDiagram(list("weightedRF" = names(alphas_rf[alphas_rf!=0]),
                   "weightedLASSO" = names(alphas_lasso[alphas_lasso!=0])), edge_size = 2)+ 
  scale_color_manual( values = c("#C55A11","#E67F87"))+
  scale_fill_gradient(high="#e6f2ff", low="#e6f2ff")+
  theme(legend.position = "none");venn

# hypergeometric test to assess the significance of the intersect
p_enrich <- phyper(q=220, m = length(names(alphas_rf[alphas_rf!=0])), 
                   n = length(genes) - length(names(alphas_rf[alphas_rf!=0])), 
                   k = length( names(alphas_lasso[alphas_lasso!=0])), lower.tail = F)
p_enrich
## [1] 1.442015e-40
ggexport(venn, filename = "results/classes_intersection.pdf", width = 4, height = 4)

Min MSE :

load("results/alpha_per_gene_weighted_LASSO_min.rdata")
load("results/alpha_per_gene_weighted_RF_min.rdata")

venn <- ggVennDiagram(list("weightedRF" = names(alphas_rf[alphas_rf!=0]),
                   "weightedLASSO" = names(alphas_lasso[alphas_lasso!=0])), edge_size = 2)+ 
  scale_color_manual( values = c("#C55A11","#E67F87"))+
  scale_fill_gradient(high="#e6f2ff", low="#e6f2ff")+
  theme(legend.position = "none");venn

# hypergeometric test to assess the significance of the intersect
p_enrich <- phyper(q=220, m = length(names(alphas_rf[alphas_rf!=0])), 
                   n = length(genes) - length(names(alphas_rf[alphas_rf!=0])), 
                   k = length( names(alphas_lasso[alphas_lasso!=0])), lower.tail = F)
p_enrich
## [1] 1
ggexport(venn, filename = "results/classes_intersection_min.pdf", width = 4, height = 4)

Motif enrichment in gene lists

load("results/alpha_per_gene_weighted_LASSO_test.rdata")
load("results/alpha_per_gene_weighted_RF.rdata")
load("rdata/pwm_prom_jaspar_dap.rdata")

pos_class_rf <- names(alphas_rf[alphas_rf!=0])
pos_class_lasso <- names(alphas_lasso[alphas_lasso!=0])
common <- intersect(pos_class_lasso, pos_class_rf)

known_tfs <- tfs[which(tfs %in% pwm_prom$TF)]

get_number_of_motifs_per_tfs <- function(genes){
  table(pwm_prom[pwm_prom$target %in% genes & pwm_prom$TF %in% tfs,"TF"])[known_tfs]
}


gene_list <- pos_class_rf
enrichments_per_pwm <- setNames(rep(1, length(known_tfs)), known_tfs)
n_targets_lasso_in_all <- get_number_of_motifs_per_tfs(genes)
n_targets_lasso_in_group <- get_number_of_motifs_per_tfs(gene_list)
n_group_lasso <- length(gene_list)

for(tf in known_tfs){
  
  # number of genes with that motif in all genes
  n_targets_in_all_tf <- n_targets_lasso_in_all[tf]
  
  # number of genes with that motif in the lasso group
  n_targets_lasso_in_group_tf <- n_targets_lasso_in_group[tf]
  p_lasso <- phyper(q=n_targets_lasso_in_group_tf-1,
         m=n_targets_in_all_tf, #white balls
         n=length(genes)-n_targets_in_all_tf, # black balls
         k=n_group_lasso, lower.tail = FALSE)
  
  
  enrichments_per_pwm[tf]<- p_lasso
}
enriched_tfs <- names(which(enrichments_per_pwm < 0.05))
DIANE::get_gene_information(enriched_tfs, organism = "Arabidopsis thaliana")
gene_list <- pos_class_lasso
enrichments_per_pwm <- setNames(rep(1, length(known_tfs)), known_tfs)
n_targets_lasso_in_all <- get_number_of_motifs_per_tfs(genes)
n_targets_lasso_in_group <- get_number_of_motifs_per_tfs(gene_list)
n_group_lasso <- length(gene_list)

for(tf in known_tfs){
  
  # number of genes with that motif in all genes
  n_targets_in_all_tf <- n_targets_lasso_in_all[tf]
  
  # number of genes with that motif in the lasso group
  n_targets_lasso_in_group_tf <- n_targets_lasso_in_group[tf]
  p_lasso <- phyper(q=n_targets_lasso_in_group_tf-1,
         m=n_targets_in_all_tf, #white balls
         n=length(genes)-n_targets_in_all_tf, # black balls
         k=n_group_lasso, lower.tail = FALSE)
  
  
  enrichments_per_pwm[tf]<- p_lasso
}
enriched_tfs <- names(which(enrichments_per_pwm < 0.05))
DIANE::get_gene_information(enriched_tfs, organism = "Arabidopsis thaliana")
gene_list <- common
enrichments_per_pwm <- setNames(rep(1, length(known_tfs)), known_tfs)
n_targets_lasso_in_all <- get_number_of_motifs_per_tfs(genes)
n_targets_lasso_in_group <- get_number_of_motifs_per_tfs(gene_list)
n_group_lasso <- length(gene_list)

for(tf in known_tfs){
  
  # number of genes with that motif in all genes
  n_targets_in_all_tf <- n_targets_lasso_in_all[tf]
  
  # number of genes with that motif in the lasso group
  n_targets_lasso_in_group_tf <- n_targets_lasso_in_group[tf]
  p_lasso <- phyper(q=n_targets_lasso_in_group_tf-1,
         m=n_targets_in_all_tf, #white balls
         n=length(genes)-n_targets_in_all_tf, # black balls
         k=n_group_lasso, lower.tail = FALSE)
  
  
  enrichments_per_pwm[tf]<- p_lasso
}
enriched_tfs <- names(which(enrichments_per_pwm < 0.05))
DIANE::get_gene_information(enriched_tfs, organism = "Arabidopsis thaliana")

Precision curves

draw_validation <- function(validation, densities = c(0.005,0.01,0.05), precision_only = F){
  data_val <- validation %>%
    filter(density %in% densities)%>%
    group_by(model, alpha, dataset, density) %>%
    mutate(mean_precision = mean(precision, na.rm = T),
           sd_precision = sd(precision, na.rm = T),
           mean_recall = mean(recall, na.rm = T),
           sd_recall = sd(recall, na.rm = T),
           density = paste("D =", density),
           model = str_replace(model, "bRF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO")) %>%
  dplyr::select(-network_name) %>%
  mutate(alpha = as.numeric(alpha))

precision <- data_val %>%
    ggplot(aes(
      x = as.numeric(alpha),
      y = precision,
      color = dataset,
      fill = dataset
    ))+
    ggh4x::facet_nested_wrap(vars(density), ncol = 8, nest_line = T) + 
    geom_ribbon(aes(ymin = mean_precision - sd_precision , 
                    ymax = mean_precision + sd_precision  ), 
                alpha = .4) + theme_pubr(legend = "top")+
    geom_point(alpha = 0.1) +
  xlab(expression(alpha)) + ylab("Precision") +
  ggtitle(paste("Precision against DAP-Seq")) +
  geom_line(aes(group = dataset, y = mean_precision)) +
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'top'
  )+scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T)

recall <- data_val %>%
    ggplot(aes(
      x = as.numeric(alpha),
      y = recall,
      color = dataset,
      fill = dataset
    ))+
    ggh4x::facet_nested_wrap(vars(density), ncol = 8, nest_line = T, scales = "free") + 
    geom_ribbon(aes(ymin = mean_recall - sd_recall , 
                    ymax = mean_recall + sd_recall  ), 
                alpha = .4)  +theme_pubr(legend = "top")+
    geom_point(alpha = 0.1) + xlab("alpha") +
  xlab(expression(alpha)) + ylab("Recall") +
  geom_line(aes(group = dataset, y = mean_recall)) +
  ggtitle(paste("Recall against DAPSeq")) +
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_text(size = 12),
    legend.position = 'top'
  )+scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))

if(precision_only)
  return(precision)
else
  return(precision/recall)
}
load(file = "results/100_permutations_bRF_validation_dap_inf.rdata")
val_rf <- val_dap
load(file = "results/100_permutations_lasso_validation_dap.rdata")
val_lasso <- val_dap

draw_validation(validation = val_lasso)

draw_validation(validation = val_rf)

Gene specific GRNs

# load("results/alpha_per_gene_weighted_LASSO_test.rdata")
# load("results/alpha_per_gene_weighted_RF.rdata")


load("results/alpha_per_gene_weighted_LASSO_min.rdata")
load("results/alpha_per_gene_weighted_RF_min.rdata")

hist(alphas_lasso)
hist(alphas_rf)

nCores = 40
mats <- list()
nrep <- 10
for(rep in 1:nrep){ # exploring inherent variability
  
  mat_lasso <- weightedLASSO_inference(counts, genes, tfs,
                                   alpha = alphas_lasso, N = 50,
                                   tf_expression_permutation = FALSE,
                                   int_pwm_noise = 0, mda_type = "shuffle",
                                   pwm_occurrence = pwm_occurrence,
                                   nCores = nCores)

  mat_lasso_perm <- weightedLASSO_inference(counts, genes, tfs,
                                   alpha = alphas_lasso, N = 50,
                                   tf_expression_permutation = TRUE,
                                   int_pwm_noise = 0, mda_type = "shuffle",
                                   pwm_occurrence = pwm_occurrence,
                                   nCores = nCores)

  mats[[paste0("LASSO_trueData_", rep)]] <- mat_lasso
  mats[[paste0("LASSO_shuffled_", rep)]] <- mat_lasso_perm
  
   mat_rf <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
                            alpha = alphas_rf,
                            pwm_occurrence = pwm_occurrence,
                            nCores = nCores,
                            importance = "%IncMSE")
    
    mat_rf_perm <- weightedRF_inference(counts, genes, tfs, nTrees = 2000,
                            alpha = alphas_rf, 
                            tf_expression_permutation = TRUE,
                            pwm_occurrence = pwm_occurrence,
                            nCores = nCores,
                            importance = "%IncMSE")
  
  mats[[paste0("RF_trueData_", rep)]] <- mat_rf
  mats[[paste0("RF_shuffled_", rep)]] <- mat_rf_perm
}
save(mats, file = "results/gene_specific_grns_min.rdata")


edges <- list()
lmses <- data.frame(row.names = genes)
for(name in names(mats)){ 
      for(density in c(0.005,0.01,0.05)){# exploring importance threshold stringency
        
        if(str_detect(name, "LASSO"))
          edges[[paste0(name, '_', density)]] <-
        weightedLASSO_network(mats[[name]], density = density, pwm_occurrence, 
                          genes, tfs, decreasing = TRUE)
        else
          edges[[paste0(name, '_', density)]] <-
        weightedRF_network(mats[[name]], density = density, pwm_occurrence, 
                          genes, tfs)
        lmses[,name] <- mats[[name]]["mse",]
      }
}
save(lmses, file = "results/gene_specific_mse_min.rdata") 
save(edges, file = "results/gene_specific_edges_min.rdata") 

For the max divergence to permutations metric

load("results/gene_specific_grns.rdata")
load("results/gene_specific_mse.rdata")
load("results/gene_specific_edges.rdata")

settings <- c("model", "dataset", "rep", "density")
# val_specific <-
#   evaluate_networks(
#     edges,
#     validation = c("DAPSeq"),
#     nCores = 30,
#     input_genes = genes,
#     input_tfs = tfs
#   )
# val_specific[, settings] <-
#   str_split_fixed(val_specific$network_name, '_', length(settings))
# save(val_specific, file = "results/gene_specific_validation_min.rdata")
load("results/gene_specific_validation.rdata")


precision_rf <- draw_validation(validation = val_rf, 
                                   densities = c(0.005), precision_only = T)
precision_lasso <- draw_validation(validation = val_lasso, 
                                   densities = c(0.005), precision_only = T)


spec_rf <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "RF") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  ) + xlab("") + ylim(c(0.2,0.46))+ ylab("")

spec_lasso <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "LASSO") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")
# gene specific MSE
mse_lasso_spec <- lmses %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
  separate(variable,
           into = c("model", "dataset",  "rep"),
           sep = "_") %>%
filter(model == "LASSO") %>%
group_by(dataset, rep) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
  ggplot(aes(x=dataset, y = median_MSE, color = dataset, fill = dataset)) + 
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
  scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
  strip.background = element_blank(),
  axis.title.x = element_text(size = 22),
  title = element_text(size = 16),
  axis.ticks.x = element_blank(),
  strip.text = element_text(size = 16),
  legend.text = element_text(size = 15),
  axis.text.x = element_blank(),
  legend.position = 'none'
)+ xlab("")+ ylab("MSE")

mse_rf_spec <- lmses %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
  separate(variable,
           into = c("model", "dataset",  "rep"),
           sep = "_") %>%
filter(model == "RF") %>%
group_by(dataset, rep) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
  ggplot(aes(x=dataset, y = median_MSE, color = dataset, fill = dataset)) + 
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
  scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
  strip.background = element_blank(),
  axis.ticks.x = element_blank(),
  axis.title.x = element_text(size = 22),
  title = element_text(size = 16),
  strip.text = element_text(size = 16),
  legend.text = element_text(size = 15),
  axis.text.x = element_blank(),
  legend.position = 'none'
)+ xlab("")+ ylab("MSE") 


# MSE for global alphas
draw_mse <- function(model){
data <- lmses[as.numeric(str_split_fixed(colnames(lmses), '_', 4)[,4]) <=10] %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
  separate(variable,
           into = c("model", "alpha", "dataset",  "rep"),
           sep = "_") %>%
filter(model == model) %>%
group_by(dataset, rep, alpha) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
  group_by(alpha, dataset) %>%
  mutate(mean_median_mse = mean(median_MSE),
         sd_median_mse = sd(median_MSE)) %>%
  mutate(alpha = as.numeric(alpha))
data %>%
  ggplot(aes(x=alpha, y = median_MSE, color = dataset, fill = dataset)) + 
scale_color_manual(values = setNames(c("grey", "#70AD47"),
                                     c("shuffled", "trueData")))+
  scale_fill_manual(values = setNames(c("grey", "#70AD47"), 
                                      c("shuffled", "trueData"))) +
  geom_line(aes(y=mean_median_mse, group = dataset)) +
  geom_ribbon(aes(ymin = mean_median_mse - sd_median_mse , 
                    ymax = mean_median_mse + sd_median_mse), 
                alpha = .4)+ xlab(expression(alpha)) +
theme_pubr() + geom_point(width = 0.2, size = 2)+
theme(
  strip.background = element_blank(),
  axis.title.x = element_text(size = 22),
  title = element_text(size = 16),
  strip.text = element_text(size = 16),
  legend.text = element_text(size = 15),
  legend.position = 'none'
)+ ylab("MSE")
}

load("results/lasso_perumtations_mse_all_genes_test.rdata")
mse_lasso <- draw_mse("LASSO")

load("results/brf_perumtations_mse_all_genes_predict.rdata")
mse_rf <- draw_mse("RF")
x_min <- 0.19
plot <- mse_lasso + ylim(c(x_min,0.265)) +labs(subtitle = "Global\ndata integration")+ 
  mse_lasso_spec+ ylim(c(x_min,0.265)) +labs(subtitle = "Gene-specific\ndata integration")+ylab("") +
  mse_rf +  ylim(c(x_min,0.265)) +labs(subtitle = "Global\ndata integration")+ ylab("") +
  mse_rf_spec+ ylim(c(x_min,0.265))  +labs(subtitle = "Gene-specific\ndata integration")+ylab("") +
  precision_lasso + ylim(c(0.2,0.46)) +ggtitle("")+ labs(subtitle = "Global\ndata integration")+
  theme(legend.position = 'none') + 
  spec_lasso + labs(subtitle = "Gene-specific\ndata integration")+
  precision_rf +labs(subtitle = "Global\ndata integration")+ ylab("") +ylim(c(0.2,0.46))+
  theme(legend.position = 'none')+ggtitle("") + 
  spec_rf +labs(subtitle = "Gene-specific\ndata integration")+
plot_layout(guides = "collect", ncol = 4, widths = c(1.5,1,1.5,1))& theme(legend.position = 'bottom');plot

ggexport(plot, filename = "results/specific_grns.pdf", width = 10, height = 8)

For the min MSE criteria

load("results/gene_specific_grns_min.rdata")
load("results/gene_specific_mse_min.rdata")
load("results/gene_specific_edges_min.rdata")

# settings <- c("model", "dataset", "rep", "density")
# val_specific <-
#   evaluate_networks(
#     edges,
#     validation = c("DAPSeq"),
#     nCores = 30,
#     input_genes = genes,
#     input_tfs = tfs
#   )
# val_specific[, settings] <-
#   str_split_fixed(val_specific$network_name, '_', length(settings))
load("results/gene_specific_validation_min.rdata")
# save(val_specific, file = "results/gene_specific_validation_min.rdata")


spec_rf <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "RF") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_blank(),
    legend.position = 'none'
  ) + xlab("") + ylim(c(0.2,0.46))+ ylab("")

spec_lasso <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "LASSO") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")


mse_lasso_spec <- lmses %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
  separate(variable,
           into = c("model", "dataset",  "rep"),
           sep = "_") %>%
filter(model == "LASSO") %>%
group_by(dataset, rep) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
  ggplot(aes(x=dataset, y = median_MSE, color = dataset, fill = dataset)) + 
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
  scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
  strip.background = element_blank(),
  axis.title.x = element_text(size = 22),
  title = element_text(size = 16),
  strip.text = element_text(size = 16),
  legend.text = element_text(size = 15),
  axis.text.x = element_blank(),
  legend.position = 'none'
)+ xlab("")+ ylab("")

mse_rf_spec <- lmses %>%
rownames_to_column("gene") %>%
reshape2::melt()%>%
  separate(variable,
           into = c("model", "dataset",  "rep"),
           sep = "_") %>%
filter(model == "RF") %>%
group_by(dataset, rep) %>%
summarise(median_MSE = median(value, na.rm=T))%>%
  ggplot(aes(x=dataset, y = median_MSE, color = dataset, fill = dataset)) + 
scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
  scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
theme_pubr() + geom_jitter(width = 0.2, size = 2)+
theme(
  strip.background = element_blank(),
  axis.title.x = element_text(size = 22),
  title = element_text(size = 16),
  strip.text = element_text(size = 16),
  legend.text = element_text(size = 15),
  axis.text.x = element_blank(),
  legend.position = 'none'
)+ xlab("")+ ylab("") 


mse_lasso + ylim(c(0.17,0.265)) +labs(subtitle = "LASSO")+ ggtitle("MSE") + 
  mse_lasso_spec+ ylim(c(0.17,0.265)) +labs(subtitle = "LASSO")+
  mse_rf +  ylim(c(0.17,0.265)) +labs(subtitle = "RF") +
  mse_rf_spec+ ylim(c(0.17,0.265))  +labs(subtitle = "RF")+
  precision_lasso + ylim(c(0.2,0.46)) + ggtitle("Precision") + labs(subtitle = "LASSO")+
  theme(legend.position = 'none') + 
  spec_lasso + labs(subtitle = "LASSO")+
  precision_rf + labs(subtitle = "RF") +ylim(c(0.2,0.46))+theme(legend.position = 'none')+ggtitle("") + 
  spec_rf +labs(subtitle = "RF") +
plot_layout(guides = "collect", ncol = 4, widths = c(1,1,1,1)) 

Do genes lost by our metric have a lower precision?

Showing the precision of grns with min method and D=0.005

  • only on the genes not present with our divergence metric
  • for all genes
load("results/gene_specific_grns_min.rdata")
edges <- list()
# lmses <- data.frame(row.names = genes)
for(name in names(mats)){ 
      for(density in c(0.005)){# exploring importance threshold stringency
        
        if(str_detect(name, "LASSO"))
          edges[[paste0(name, '_', density)]] <-
        weightedLASSO_network(mats[[name]][,lost_lasso], density = density, pwm_occurrence, 
                          lost_lasso, tfs, decreasing = TRUE)
        else
          edges[[paste0(name, '_', density)]] <-
        weightedRF_network(mats[[name]][,lost_rf], density = density, pwm_occurrence, 
                          lost_rf, tfs)
        # lmses[,name] <- mats[[name]]["mse",]
      }
}



settings <- c("model", "dataset", "rep", "density")
val_specific_lost <-
  evaluate_networks(
    edges,
    validation = c("DAPSeq"),
    nCores = 30,
    input_genes = genes,
    input_tfs = tfs
  )
## 4.928 sec elapsed
val_specific_lost[, settings] <-
  str_split_fixed(val_specific_lost$network_name, '_', length(settings))

load("results/gene_specific_validation_min.rdata")

spec_rf <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "RF") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  ) + xlab("") + ylim(c(0.2,0.46))+ ylab("")

spec_lasso <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "LASSO") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")


spec_rf_lost <- val_specific_lost %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "RF") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_blank(),
    legend.position = 'none'
  ) + xlab("") + ylim(c(0.2,0.46))+ ylab("")

spec_lasso_lost <- val_specific_lost %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "LASSO") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")



spec_lasso_lost + ggtitle("genes lost lasso") + spec_lasso + ggtitle("all genes lasso") +
spec_rf_lost + ggtitle("genes lost rf") +spec_rf+ ggtitle("all genes rf") + plot_layout(ncol = 4)

# save(val_specific, file = "results/gene_specific_validation_min.rdata")
# save(lmses, file = "results/gene_specific_mse_min.rdata") 
# save(edges, file = "results/gene_specific_edges_min.rdata") 

Showing the precision of grns with max div method and D=0.005

  • only on the genes not present with min mse method, gained by our metric
  • for all genes
load("results/gene_specific_grns.rdata")
edges <- list()
# lmses <- data.frame(row.names = genes)
for(name in names(mats)){ 
      for(density in c(0.005)){# exploring importance threshold stringency
        
        if(str_detect(name, "LASSO"))
          edges[[paste0(name, '_', density)]] <-
        weightedLASSO_network(mats[[name]][,saved_lasso], density = density, pwm_occurrence, 
                          saved_lasso, tfs, decreasing = TRUE)
        else
          edges[[paste0(name, '_', density)]] <-
        weightedRF_network(mats[[name]][,saved_rf], density = density, pwm_occurrence, 
                          saved_rf, tfs)
        # lmses[,name] <- mats[[name]]["mse",]
      }
}



settings <- c("model", "dataset", "rep", "density")
val_specific_gain <-
  evaluate_networks(
    edges,
    validation = c("DAPSeq"),
    nCores = 30,
    input_genes = genes,
    input_tfs = tfs
  )
## 5.487 sec elapsed
val_specific_gain[, settings] <-
  str_split_fixed(val_specific_gain$network_name, '_', length(settings))

load("results/gene_specific_validation.rdata")

spec_rf <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "RF") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0,1))+ ylab("")

spec_lasso <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "LASSO") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0,1))+ ylab("")


spec_rf_gain <- val_specific_gain %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "RF") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0,1))+ ylab("")

spec_lasso_gain <- val_specific_gain %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "LASSO") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0,1))+ ylab("")



spec_lasso_gain + ggtitle("saved lasso") + spec_lasso + ggtitle("all genes lasso") +
spec_rf_gain+ ggtitle("saved rf") +spec_rf+ ggtitle("all genes rf") + plot_layout(ncol = 4)

# save(val_specific, file = "results/gene_specific_validation_min.rdata")
# save(lmses, file = "results/gene_specific_mse_min.rdata") 
# save(edges, file = "results/gene_specific_edges_min.rdata") 

Showing the precision of grns with max div method and D=0.05

  • only on the genes not present with min mse method, gained by our metric
  • for all genes
load("results/gene_specific_grns.rdata")
edges <- list()
# lmses <- data.frame(row.names = genes)
for(name in names(mats)){ 
      for(density in c(0.01)){# exploring importance threshold stringency
        
        if(str_detect(name, "LASSO"))
          edges[[paste0(name, '_', density)]] <-
        weightedLASSO_network(mats[[name]][,saved_lasso], density = density, pwm_occurrence, 
                          saved_lasso, tfs, decreasing = TRUE)
        else
          edges[[paste0(name, '_', density)]] <-
        weightedRF_network(mats[[name]][,saved_rf], density = density, pwm_occurrence, 
                          saved_rf, tfs)
        # lmses[,name] <- mats[[name]]["mse",]
      }
}



settings <- c("model", "dataset", "rep", "density")
val_specific_gain <-
  evaluate_networks(
    edges,
    validation = c("DAPSeq"),
    nCores = 30,
    input_genes = genes,
    input_tfs = tfs
  )
## 5.304 sec elapsed
val_specific_gain[, settings] <-
  str_split_fixed(val_specific_gain$network_name, '_', length(settings))

load("results/gene_specific_validation.rdata")

spec_rf <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.01 & model == "RF") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0,0.7))+ ylab("")

spec_lasso <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.01 & model == "LASSO") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0,0.7))+ ylab("")


spec_rf_gain <- val_specific_gain %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.01 & model == "RF") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0,0.7))+ ylab("")

spec_lasso_gain <- val_specific_gain %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.01 & model == "LASSO") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0,0.7))+ ylab("")



spec_lasso_gain + ggtitle("saved lasso") + spec_lasso + ggtitle("all genes lasso") +
spec_rf_gain+ ggtitle("saved rf") +spec_rf+ ggtitle("all genes rf") + plot_layout(ncol = 4)

# save(val_specific, file = "results/gene_specific_validation_min.rdata")
# save(lmses, file = "results/gene_specific_mse_min.rdata") 
# save(edges, file = "results/gene_specific_edges_min.rdata") 

TARGET

load(file = "results/100_permutations_bRF_edges_inf.rdata")

# validation of GRN edges against DAP-Seq
settings <- c("model", "alpha", "dataset", "rep", "density")
val_targ <-
  evaluate_networks(
    edges,
    validation = c("TARGET"),
    nCores = 35,
    input_genes = genes,
    input_tfs = tfs
  )
## 188.751 sec elapsed
val_targ[, settings] <-
  str_split_fixed(val_targ$network_name, '_', length(settings))
save(val_targ, file = "results/100_permutations_rf_validation_target_inf.rdata")
load(file = "results/100_permutations_lasso_edges.rdata")

# validation of GRN edges against DAP-Seq
settings <- c("model", "alpha", "dataset", "rep", "density")
val_targ <-
  evaluate_networks(
    edges,
    validation = c("TARGET"),
    nCores = 35,
    input_genes = genes,
    input_tfs = tfs
  )
val_targ[, settings] <-
  str_split_fixed(val_targ$network_name, '_', length(settings))
save(val_targ, file = "results/100_permutations_lasso_validation_target.rdata")
load(file = "results/100_permutations_lasso_validation_target.rdata")

lasso_targ <- draw_validation(validation = val_targ, precision_only = T, densities = 0.005)
load(file = "results/100_permutations_rf_validation_target_inf.rdata")

rf_targ <- draw_validation(validation = val_targ, precision_only = T, densities = 0.005)

load("results/gene_specific_edges.rdata")
settings <- c("model", "dataset", "rep", "density")
val_specific <-
  evaluate_networks(
    edges,
    validation = c("TARGET"),
    nCores = 30,
    input_genes = genes,
    input_tfs = tfs
  )
## 13.315 sec elapsed
val_specific[, settings] <-
  str_split_fixed(val_specific$network_name, '_', length(settings))
save(val_specific, file = "results/gene_specific_validation_target.rdata")

spec_rf <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "RF") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  ) + xlab("") + ylim(c(0.2,0.46))+ ylab("")

spec_lasso <- val_specific %>%
  separate(network_name, into = settings, sep = "_") %>%
    filter(density == 0.005 & model == "LASSO") %>%
  mutate(density = paste("D =", density),
           model = str_replace(model, "RF", "weightedRF"),
           model = str_replace(model, "LASSO", "weightedLASSO"))%>%
  ggplot(aes(x=dataset, y = precision, color = dataset, fill = dataset)) + 
  scale_color_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData")))+
    scale_fill_manual(values = setNames(c("grey", "#70AD47"), c("shuffled", "trueData"))) +
  geom_hline(color = "#C55A11", yintercept = 0.331042, size= 1.5, show.legend = T) + 
  geom_boxplot(size = 1, alpha = 0.5, outlier.alpha = 0) +
  theme_pubr() + geom_jitter(width = 0.2, size = 2)+
  theme(
    strip.background = element_blank(),
    axis.title.x = element_text(size = 22),
    title = element_text(size = 16),
    axis.ticks.x = element_blank(),
    strip.text = element_text(size = 16),
    legend.text = element_text(size = 15),
    axis.text.x = element_blank(),
    legend.position = 'none'
  )+ xlab("")+ ylim(c(0.2,0.46))+ ylab("")


spec_lasso+ylim(0.1, 0.26)+ lasso_targ+ylim(0.1, 0.26) + 
  spec_rf +ylim(0.1, 0.26) + rf_targ +ylim(0.1, 0.26) + plot_layout(ncol = 4)+plot_annotation("TARGET validation")

load("rdata/pwm_prom_jaspar_dap.rdata")
load("rdata/gene_structure.rdata")


load("results/gene_specific_grns.rdata")
load("results/gene_specific_mse.rdata")
load("results/gene_specific_edges.rdata")

mean_expr <- rowMeans(counts)[genes]
var_expr <- matrixStats::rowSds(counts[genes,])*matrixStats::rowSds(counts[genes,])
pwm_prom_n_TFs <- pwm_prom[pwm_prom$TF %in% tfs,]

library(patchwork)

# to comment for new version where mse is already normalized per genes
# norm_mse <- exp(as.matrix(cbind(lmses, lmses_lasso)))/var_expr
genes_info <- data.frame(genes = genes,
                         cluster_rf = alphas_rf,
                         cluster_lasso = alphas_lasso)

genes_info$is_tf <- genes %in% tfs
genes_info$var <- var_expr
genes_info$expr <- mean_expr
genes_info$nb_motifs <- table(pwm_prom$target)[genes]
genes_info$nb_motifs_n_tfs <- table(pwm_prom_n_TFs$target)[genes]
genes_info$common <- genes %in% common

genes_info[,c("n_introns", "n_transcripts")] <-
  gene_structure[match(genes_info$gene, gene_structure$gene),
                 c("n_introns", "n_transcripts")]


# for rf
genes_info%>%
  ggplot(aes(x=factor(cluster_rf), y=log(n_introns))) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of introns for RF groups")) +
genes_info%>%
  ggplot(aes(x=factor(cluster_rf), y=n_transcripts)) +
  geom_boxplot(width=0.1, fill = "white")+
  geom_violin(fill="darkblue", alpha=0.2)+
  ggtitle(("Number of transcripts for RF groups")) +
  stat_compare_means()

genes_info%>%
  ggplot(aes(x=factor(cluster_rf), y=nb_motifs)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs in promoter for RF groups")) +
  genes_info%>%
  ggplot(aes(x=factor(cluster_rf), y=nb_motifs_n_tfs)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs of N-responsive TFs in promoter for RF groups")) +
  stat_compare_means() 

genes_info%>%
  ggplot(aes(x=factor(cluster_rf), y=var)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene variance for RF groups")) + scale_y_log10()+
  stat_compare_means()+ genes_info%>%
  ggplot(aes(x=factor(cluster_rf), y=expr)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene expression for RF groups")) + scale_y_log10()+
  stat_compare_means()

genes_info %>%
  group_by(cluster_rf) %>%
  summarise(n=n(),
            tf_frac=sum(is_tf)/n()) %>%
  ggplot(aes(x=cluster_rf, y=tf_frac,
                          label = paste("n=",n))) +
  geom_bar(stat = "identity", aes(fill=tf_frac), alpha = 1)+
  geom_hline(yintercept = length(tfs)/length(genes)) +
  geom_text(y=0.2) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in RF groups")+ylim(c(0,0.2))

###########" for lasso

genes_info%>%
  ggplot(aes(x=factor(cluster_lasso), y=log(n_introns))) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of introns for LASSO groups")) +
genes_info%>%
  ggplot(aes(x=factor(cluster_lasso), y=n_transcripts)) +
  geom_boxplot(width=0.1, fill = "white")+
  geom_violin(fill="darkblue", alpha=0.2)+
  ggtitle(("Number of transcripts for LASSO groups")) +
  stat_compare_means()

genes_info%>%
  ggplot(aes(x=factor(cluster_lasso), y=nb_motifs)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs in promoter for LASSO groups")) +
  genes_info%>%
  ggplot(aes(x=factor(cluster_lasso), y=nb_motifs_n_tfs)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs of N-responsive TFs in promoter for LASSO groups")) +
  stat_compare_means() 

genes_info%>%
  ggplot(aes(x=factor(cluster_lasso), y=var)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene variance for LASSO groups")) + scale_y_log10()+
  stat_compare_means()+ genes_info%>%
  ggplot(aes(x=factor(cluster_lasso), y=expr)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene expression for LASSO groups")) + scale_y_log10()+
  stat_compare_means()

genes_info %>%
  group_by(cluster_lasso) %>%
  summarise(n=n(),
            tf_frac=sum(is_tf)/n()) %>%
  ggplot(aes(x=cluster_lasso, y=tf_frac,
                          label = paste("n=",n))) +
  geom_bar(stat = "identity", aes(fill=tf_frac), alpha = 1)+
  geom_hline(yintercept = length(tfs)/length(genes)) +
  geom_text(y=0.2) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in RF groups")+ylim(c(0,0.2))

######## common classes of interest as compared to the rest
length(intersect(common, tfs))/length(common)
## [1] 0.2045455
genes_info%>%
  ggplot(aes(x=common, y=log(n_introns))) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of introns for common genes of interest")) +
genes_info%>%
  ggplot(aes(x=common, y=n_transcripts)) +
  geom_boxplot(width=0.1, fill = "white")+
  geom_violin(fill="darkblue", alpha=0.2)+
  ggtitle(("Number of transcripts for common groups")) +
  stat_compare_means()

genes_info%>%
  ggplot(aes(x=common, y=nb_motifs)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs in promoter for common groups")) +
  genes_info%>%
  ggplot(aes(x=common, y=nb_motifs_n_tfs)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Number of motifs of N-responsive TFs in promoter for common groups")) +
  stat_compare_means() 

genes_info%>%
  ggplot(aes(x=common, y=var)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene variance for common groups")) + scale_y_log10()+
  stat_compare_means()+ genes_info%>%
  ggplot(aes(x=common, y=expr)) +
  geom_violin(fill="darkblue", alpha=0.2)+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene expression for common groups")) + scale_y_log10()+
  stat_compare_means()

genes_info %>%
  group_by(common) %>%
  summarise(n=n(),
            tf_frac=sum(is_tf)/n()) %>%
  ggplot(aes(x=common, y=tf_frac,
                          label = paste("n=",n))) +
  geom_bar(stat = "identity", aes(fill=tf_frac), alpha = 1)+
  geom_hline(yintercept = length(tfs)/length(genes)) +
  geom_text(y=0.2) + xlab("cluster common") +
  ggtitle("Fraction of TFs in common groups")+ylim(c(0,0.22))

phyper(q=length(intersect(common, tfs)), 
       m = length(tfs), 
       n = length(genes) - length(tfs),
       k = length(common), lower.tail = F)
## [1] 0.004688486